* This code merges the DHS (individual and men recode) and standardises the variables
* Input files: 
* 1. DHS Male recodes (MR): GHMR31FL.dta, GHMR41FL.dta, GHMR4BFL.dta, GHMR5AFL.dta, GHMR71FL.dta
* 2. DHS Individual recodes (IR): GHIQ7JFL.dta, GHIR31FL.dta, GHIR41FL.dta, GHIR4BFL.dta, GHIR5AFL.dta, GHIR72FL.dta, GHIR7BFL.dta
* 3. DGS GPS files: corresponding
* Author: Alexander Moradi

* 1. Merging DHS IR
* 2. Merging DHS MR
* 3. Merging DHS 2017 Special
* 4. Merge the files

cd "C:\Users\AMoradi\Dropbox\Economics of Missionary Expansion\Replication Files JEBO\Replication Stata\"


*******************************************************************************************************************************************************
* 1. Merging individual recodes

** ids: caseid v000 v001 v003 v004 v005
** v130 religion
** v106 highest educational level
** v149 educational attainment

clear
clear matrix

set maxvar 10000

local dhs2keep "caseid v000 v001 v002 v003 v004 v005 v007 v008 v009 v010 v011 v012 v013 v104 v106 v108 v130 v131 v133 v149 v150 v190 v191" // This defines the set of variables that we want to keep


********************************************************************************
* Merge Individual Recode files
file open dhs using "Data\DHS/DHS_surveys.txt", read  

file read dhs line
use  "Data\DHS/`line'"
gen dhs_ir="`line'"
keep caseid v* dhs_ir // remove variables because there is a very large set of variables
save master_dhs_ir, replace

file read dhs line
while r(eof)==0 { /* while you're not at the end of the file */
	
	append using "Data/DHS/`line'"
	replace dhs_ir="`line'" if dhs_ir==""
	keep caseid v* dhs_ir // remove variables because there is a very large set of variables
	save master_dhs_ir, replace

	file read dhs line
	}
file close dhs


************** Some DHS variables have a special more detailed religion variable
** s115 in GHIR31FL.dta contains the whole set of religions. s115 was removed from the merging
* replace v130 with s115 in GHIR31FL.dta

append using "Data\DHS\GHIR31FL.dta"
replace dhs_ir="GHIR31FL.dta" if dhs_ir==""

replace v130=9 if v130==5 & dhs_ir=="GHIR31FL.dta"  // Traditional (not included in s115)
replace v130=s115 if v130!=9 & dhs_ir=="GHIR31FL.dta"   & s115!=.
replace v130=0 if s115==10 & dhs_ir=="GHIR31FL.dta"  
// Checked: GHIR41FL.dta also has a s115 variable but it is the same as v130 (except that it includes "other" into no religion)


keep `dhs2keep' dhs_ir

* Create identifier: 
* 1. dhscc - dhs country code (v000)
* 2. dhsyear - dhs year (v007)
* 3. dhsclust - cluster number (v001)

* 1. dhscc - dhs country code
gen dhscc=substr(v000,1,2)

* 2. dhsyear - dhs year
egen dhsyear = mean(v007), by(dhs_ir)
replace dhsyear=round(dhsyear)
replace dhsyear=dhsyear+1900 if dhsyear<=100

* 3. dhsclust - cluster number (v001)
gen dhsclust=v001
********************************************************************************


********************************************************************************
* Merge geographic coordinates
merge m:1 dhscc dhsyear dhsclust using "Data\DHS\master_gps.dta"

* ML 2010 does not have an individual recode
* drop if dhscc=="ML" & dhsyear==2010
keep if _merge==3
drop _merge


* Merge with grid data set(so to retrieve grid cells)
merge m:1 v000 v001 v002 v003 using "Data\DHS\ghana_grid.dta"
keep if _merge==3 //  _merge==1 is where gps data is missing in the DHS files (lat=0, lon=0, alt=9999) and _merge==2 are those where gps does not fit individual recode identifier (N=112)
drop _merge

* DHS_women_geo.dta contains the data of all DHS Individual Recodes with GPS coordinates

* Result: n=20,939 gridcells in cross-section and 33,078 as a panel
* collapse latitude longitude ,by(gridcell dhsyear)
* collapse latitude longitude, by(gridcell)

save "Data/DHS/DHS_women_geo.dta", replace // contains the data of all DHS Individual Recodes with GPS coordinates
*******************************************************************************************************************************************************


*******************************************************************************************************************************************************
** 2. Merge the DHS male recodes

local dhs2keep "mcaseid mv000 mv001 mv002 mv003 mv004 mv005 mv007 mv008 mv009 mv010 mv011 mv012 mv013 mv104 mv106 mv108 mv130 mv131 mv133 mv149 mv150" // This defines the set of variables that we want to keep

** Merge all men recodes that are available for Ghana with GPS coordinates
use "Data\DHS\GHMR31FL.dta", replace
gen dhs_mr="GHMR31FL.dta"

append using "Data\DHS\GHMR41FL.dta"
replace dhs_mr="GHMR41FL.dta" if dhs_mr==""

append using "Data\DHS\GHMR4BFL.dta"
replace dhs_mr="GHMR4BFL.dta" if dhs_mr==""

append using "Data\DHS\GHMR5AFL.dta"
replace dhs_mr="GHMR5AFL.dta" if dhs_mr==""

append using "Data\DHS\GHMR71FL.dta"
replace dhs_mr="GHMR71FL.dta" if dhs_mr==""

keep `dhs2keep' dhs_mr

** Prepare key variables for 1:m merge (with GPS file) 
gen dhscc=substr(mv000,1,2)
egen dhsyear = mean(mv007), by(dhs_mr)
replace dhsyear=round(dhsyear)
replace dhsyear=dhsyear+1900 if dhsyear<=100
gen dhsclust=mv001

* Merge with GPS file coordinates
merge m:1 dhscc dhsyear dhsclust using "Data\DHS\master_gps.dta" // master_gps also includes All-Africa GPS files
keep if _merge==3 // only keep Ghana 
drop _merge

* Merge with gridcell
merge m:1 dhsid using "Data\DHS\master_grid.dta"
drop if _merge==2 //  _merge==1 is where gps data is missing in the DHS files (lat=0, lon=0, alt=9999) and _merge==2 are those where gps does not fit individual recode identifier (N=112)
drop gridcell 
drop _merge

foreach var of varlist `dhs2keep' {
local newname = substr("`var'", 2, .)
rename `var' `newname'
}

* merge m:m v000 v001 using "C:\Users\am401\Dropbox\Missions2\Analysis\data\Ghana DHS_Ghana grid\Ghana DHS_Ghana grid.dta"
merge m:m v000 v001 using "Data\DHS\ghana_grid.dta"
keep if _merge==3
drop _merge

duplicates drop v000 caseid altitude, force

** Standardise religion => Merge with religion coding
replace v000="GH2" if dhsyear==1993
merge m:1 v000 v130 using "Data\DHS\religion_recode_DHS.dta"
keep if _m==3
drop _merge dhs_ir

gen sweight=v005/1000000
label var sweight "Sampling weight"

gen men=1

rename v150 mv150

save "Data\DHS\DHS_men_geo.dta", replace // contains the data of all DHS Individual Recodes with GPS coordinates
*******************************************************************************************************************************************************






*******************************************************************************************************************************************************
* 3. DHS 2017 Special 
********************************************************************************
** Clean 2017 Special survey 
* 1. Standardise variables in the Individual Recode
* 2. Standardise variables in the GPS file (gridcell variable already added using ARCmap) 
* 3. Merge 1 & 2.
* 4. Mege with remaining DHS surveys
 
tempfile dhs_special dhs_special_gps
 
* 1. Standardise variables in the Individual Recode 
use "Data\DHS\GHIQ7JFL.dta", replace
gen dhs_ir="GHIQ7JFL.dta"
gen v000="GH7-S" // country code and phase
rename qhclust v001 // cluster number
gen dhsclust=v001
rename qhnumber v002 // household number
rename qline v003 // individual line number=respondent's line number
* rename v004 // ultimate area unit does not exist
rename qweight v005
rename qinty v007
rename qintc v008
rename q105m v009 // respondent's month of birth = month of birth
replace v009=. if v009==98
rename q105y v010 // respondent's year of birth
replace v010=. if v010==9998
rename q105c v011
rename q106 v012
rename q102 v104
rename q122 v130
gen v004=.
gen v006=qintm
gen v108=1 if q111==3 // reads easily
replace v108=2 if q111==2 // reads with difficult
replace v108=3 if q111==1 // cannot read
rename q123 v131
gen v150=.
gen v190=.
gen v191=.

* v149 // highest educational attainment
gen v149=0 if q107==2 // 0 no education
replace v149=1 if q108==1 & q109<6 // 1 incomplete primary
replace v149=2 if q108==1 & q109==6 // 2 complete primary

replace v149=3 if q108==2 & q109<4 // 3 incomplete secondary (middle school)
replace v149=3 if q108==3 // 3 incomplete secondary (jss/jhs)
replace v149=3 if q108==4 & q109<3 // 3 incomplete secondary (secondary, less than 3 years)
replace v149=3 if q108==5 & q109<3 // 3 incomplete secondary (shs)

replace v149=4 if q108==2 & q109==4 // 4 complete secondary (middle school)
replace v149=4 if q108==4 & q109>=3 // 4 complete secondary, (secondary, more than 3 years)
replace v149=4 if q108==5 & q109>=3 // 4 complete secondary, (shs 3 or 4 years)

replace v149=5 if q108==6 // 5 higher

* v106 - Highest educational level
gen v106=0 if v149==0   // highest educational level: no education, primary, secondary, higher
replace v106=1 if v149==1 | v149==2
replace v106=2 if v149==3 | v149==4
replace v106=3 if v149==5

* v133 - Years of education
gen v133=0 if v106==0
replace v133=q109 if q108==1 & v133==. // primary education
replace v133=q109+7  if (q108==2 | q108==3 | q108==4) & v133==. // secondary
replace v133=q109+10  if q108==5 & v133==. // secondary
replace v133=q109+14 if v106==3 & v133==. // higher

keep v*  dhsclust dhs_ir
save `dhs_special'


* 2. Import the 2017 GPS file, gridcell info has been added using Arcmap (gridcell col_name row_name centrroidx centrroidy)
import excel "Data\DHS\GHGE7IFL.xls", sheet("Export_Output") firstrow case(lower) clear

* variables
gen urban=(urban_rura=="R")
rename longnum longitude
rename latnum latitude
replace latitude=. if latitude==0 & longitude==0
replace longitude=. if latitude==.
gen altitude=alt_dem
replace altitude=. if altitude==9999

keep dhscc dhsyear dhsclust dhsid latitude longitude altitude urban gridcell col_name row_name centrroidx centrroidy

save `dhs_special_gps'


* 3. Merge 1 & 2.
use `dhs_special'
merge m:1 dhsclust using `dhs_special_gps'
drop _merge
drop if gridcell==""
save "Data\DHS\dhs_special_gps.dta", replace
*******************************************************************************************************************************************************



*******************************************************************************************************************************************************
* 4. Merge the files

use "Data\DHS\DHS_women_geo.dta", replace
replace v010=v010+1900 if v010<100 // correct year of birth in order to XXX format   

// Append the 2017 Special Survey
append using "Data\DHS\dhs_special_gps.dta"

** Merge with religion coding
merge m:1 v000 v130 using "Data\DHS\religion_recode_DHS.dta"
drop if _m!=3
drop _merge

gen sweight=v005/1000000
label var sweight "Sampling weight"
gen men=0

* Denormalizing weights according to https://userforum.dhsprogram.com/index.php?t=getfile&id=4& "Note on DHS standard weight de-normalization By Ruilin Ren, PhD. Senior sampling statistician, ICF International"
* The normalized weight is a relative weight which is valid for estimating means, proportions and ratios, but not valid for estimating totals, and not valid for pooled data from different surveys. If one needs to estimate population totals based on DHS data or to pool data from different surveys together, the standard weight should be de-normalized.

* Append the male dataset
append using "Data\DHS\DHS_men_geo.dta"


***********************************
** Housekeeping
* Age 5-year groups
label define v013 8 "50-54", modify
label define v013 9 "55-59", modify
forval i=1(1)9 {
replace v013=`i' if v013==. & v012>=10+(`i'*5) & v012<15+(`i'*5)
}

label define v150 13 "niece by blood" 14 "niece by marriage", modify
label define mv150 1 "head" 2	"husband" 3	"son" 4 "son-in-law"	5	"grand-son" 6	"father"	8	"brother" 10	"other relative" 11	"adopted/foster child"	12	"not related" 13 "nephew by blood" 14	"nephew by marriage", modify

order dhs_mr, after( dhs_ir)
label var dhs_mr "DHS Male Recode"
label var dhs_ir "DHS Individual Recode"

label var v006 "month of interview"
label var sweight "sampling weight (v005/1000000)"

rename men male
order male, after (v013)

order sweight v006, after (v005)
order mv150, after (v150)
***********************************

* V005*=V005×(total females age 15-49 in the country at the time of the survey)/(number of women age 15-49 interviewed in the survey)
save "Data\dhs_religion_individual.dta", replace


********************************************************************************
* Calculate percentage of religion by gridcell

** Muslim/Christian from religion3
gen christian=(religion3=="Christian")
gen muslim=(religion3=="Muslim")
gen traditional=(religion3=="Traditional")
gen noreligion=(religion3=="no religion")

** Protestant/Catholic from religion3
gen protestant=(religion2=="Protestant")
replace protestant=. if religion2==""
gen catholic=(religion2=="Catholic")
replace catholic=. if religion2==""

* Presbyterian/methodist from religion1
gen presbyterian=( religion1=="Presbyterian")
replace presbyterian=. if religion1==""
gen methodist=( religion1=="Methodist")
replace methodist=. if religion1==""
gen pentecost_other=(religion1=="Pentecost\Other Christian")
replace pentecost_other=. if religion1==""

* Pentecostals from religion0
gen pentecost=(religion0=="Pentecost")
replace pentecost=. if religion0==""

* Education
gen prim_education=(v106==1)
gen sec_education=(v106==2)


collapse (mean) christian muslim traditional noreligion protestant catholic presbyterian methodist pentecost_other pentecost longitude latitude altitude urban v133 prim_education sec_education [pw=v005/1000000], by (v000 dhsyear grid)
label var pentecost_other "Pentecostal & Other Christian"

save "Data\dhs_religion_grid.dta", replace
